###############################################################
# 1) Male and female graduate enrollment (1967 - 2015)
###############################################################
# Creating a simplified data frame excluding non profit and for profit schools as well as irrelavent totals
grad_enroll_ftf <- grad_enroll %>%
select(Year, full_time_m, full_time_f, part_time_m, part_time_f)
#creating a linear model for male graduate enrollment
male_grad <- lm(grad_enroll$total_males ~ grad_enroll$Year)
summary(male_grad)
##
## Call:
## lm(formula = grad_enroll$total_males ~ grad_enroll$Year)
##
## Residuals:
## Min 1Q Median 3Q Max
## -96461 -34861 -12841 51876 95766
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17112153 1087024 -15.74 <2e-16 ***
## grad_enroll$Year 9069 546 16.61 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 54050 on 47 degrees of freedom
## Multiple R-squared: 0.8545, Adjusted R-squared: 0.8514
## F-statistic: 276 on 1 and 47 DF, p-value: < 2.2e-16
plot(male_grad)
# Correlation Testing for male graduates:
cor(x = grad_enroll$Year, y = grad_enroll$total_males, use = "everything", method = c("pearson"))
## [1] 0.9243741
# Pearson's R = 0.92
#creating a linear model for female graduate enrollment
female_grad <- lm(grad_enroll$total_females ~ grad_enroll$Year)
summary(female_grad)
##
## Call:
## lm(formula = grad_enroll$total_females ~ grad_enroll$Year)
##
## Residuals:
## Min 1Q Median 3Q Max
## -89397 -48101 -7633 45267 129727
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.896e+07 1.161e+06 -50.77 <2e-16 ***
## grad_enroll$Year 3.013e+04 5.832e+02 51.66 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 57730 on 47 degrees of freedom
## Multiple R-squared: 0.9827, Adjusted R-squared: 0.9823
## F-statistic: 2669 on 1 and 47 DF, p-value: < 2.2e-16
plot(female_grad)
# Correlation Testing for male graduates:
cor(x = grad_enroll$Year, y = grad_enroll$total_females, method = c("pearson"))
## [1] 0.9913086
# Pearson's R = 0.99
#create a graph of the models
enroll_graph <- grad_enroll %>%
ggplot(aes(x = Year, y = total_males))+
geom_point(aes(color = "total_males"))+
geom_smooth(aes(x = Year, y = total_males),method = lm, se = TRUE, size = 0.5, color = "gray20") + #plots the linear model with a confidence interval (se)
geom_point(aes(x = Year, y = total_females, color = "total_females")) +
geom_smooth(aes(x = Year, y = total_females),method = lm, se = TRUE, size = 0.5, color = "gray20")+
theme_classic() +
theme(legend.title=element_blank()) +
scale_color_manual(" ", breaks = c("total_males", "total_females"), values = c("total_males" = "royalblue1", "total_females" = "palevioletred1"), labels=c("Males", "Females")) +
labs(x = "\nYear", y = "Total Enrollment\n", title = "Male and Female Graduate Enrollment (1967-2015)\n") +
scale_x_continuous(expand = c(0,0), limits = c(1967, 2015)) +
theme(text = element_text(family = "Times New Roman")) +
theme(plot.title = element_text(hjust = 0.5, size = 13)) +
theme(axis.title = element_text(size = 12), axis.text = element_text(size = 10))
enroll_graph
###############################################################
# 2) Shifts in female PhD recipients by field (1985, 2000, and 2015)
###############################################################
# First creating a data frame with just the 5 fields in question and just the 3 years of interest.
phds_summary <- phds %>%
select("year", "physci_f", "engineer_f", "ed_f", "humart_f") %>%
filter(year == "1985" | year == "2000" | year == "2015") %>%
select("physci_f", "engineer_f", "ed_f", "humart_f") # I realize this line of code is redundant, but just put it in for my own understanding of the order in which things occurred.
rownames(phds_summary) <- c("1985", "2000", "2015")
## Warning: Setting row names on a tibble is deprecated.
#maybe a chisquare tests for proportions of females in each field by year
phds_chi <- chisq.test(phds_summary)
phds_chi
##
## Pearson's Chi-squared test
##
## data: phds_summary
## X-squared = 2073.3, df = 6, p-value < 2.2e-16
phds_prop <- prop.table(as.matrix(phds_summary), 1)
phds_summary2 <- phds %>%
select("year", "physci_f", "engineer_f", "ed_f", "humart_f") %>%
filter(year == "1985" | year == "2000" | year == "2015")
phds_summary3<-melt(phds_summary2, id.vars = 'year')
phds_graph<- phds_summary3 %>%
ggplot(aes(fill = variable, y = value, x = year)) +
geom_bar(stat = "identity", position = "fill") +
scale_x_continuous(breaks = c(1985, 2000, 2015)) +
scale_y_continuous(expand = c(0,0)) +
theme_classic() +
labs(x = "\nYear", y = "Proportion\n", title = "Proportion of Female PhD Recipients\n") +
theme(text = element_text(family = "Times New Roman")) +
theme(plot.title = element_text(hjust = 0.5, size = 13)) +
theme(axis.title = element_text(size = 12), axis.text = element_text(size = 10)) +
scale_fill_manual(values = c("lightpink1", "palevioletred1", "violetred", "maroon4"), labels = c("Physical and Earth Sciences", "Engineering", "Education", "Humanities and Arts")) +
theme(legend.title=element_blank())
phds_graph
# p-value < 0.001 therefore there is a significant association between the year a degree was awarded and the number of PhDs awarded to women in that year (X^2 = 2073, *p* , 0.001)
###############################################################
# 3) Male and female salaries for starting postdoctoral and other employment positions (2015)
###############################################################
#2 mann whitney u tests (one for median postdoc salary male vs female, one for median other employment male vs female)
#explore data for posdoc salaries
male_post_sal <- ggplot(med_salary, aes(x = postdoc_m)) +
geom_histogram(binwidth = 5000, aes(color = "black"))
male_post_sal #not normally distributed
# Checking the qq plot for this distribution
male_post_qq <- ggplot(med_salary, aes(sample = postdoc_m)) +
geom_qq()
male_post_qq
#Looking like it's potentially linear
female_post_sal <- ggplot(med_salary, aes(x = postdoc_f)) +
geom_histogram(binwidth = 5000, aes(color = "black"))
female_post_sal #not normally distributed
# Checking the qq plot for this distribution
female_post_qq <- ggplot(med_salary, aes(sample = postdoc_f)) +
geom_qq()
female_post_qq
# NOT looking linear
#want comparison of means so do Mann Whitney U
#H0: Ranks are equal
#HA: ranks are different
post_sal_test <- wilcox.test(med_salary$postdoc_f, med_salary$postdoc_m, paired = TRUE)
## Warning in wilcox.test.default(med_salary$postdoc_f,
## med_salary$postdoc_m, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(med_salary$postdoc_f,
## med_salary$postdoc_m, : cannot compute exact p-value with zeroes
post_sal_test
##
## Wilcoxon signed rank test with continuity correction
##
## data: med_salary$postdoc_f and med_salary$postdoc_m
## V = 16.5, p-value = 0.8884
## alternative hypothesis: true location shift is not equal to 0
med_sal_2<-melt(med_salary, id.vars = 'field')
post_sal_graph <- med_sal_2 %>%
filter(variable == "postdoc_m" | variable == "postdoc_f") %>%
ggplot(aes(x = field, y = value)) +
geom_col(aes(fill = variable), position = "dodge") +
scale_fill_manual(values = c("royalblue1", "palevioletred1"), labels = c("Male", "Female")) +
theme_classic() +
coord_flip() +
theme(axis.text.x = element_text(hjust = 0.5, size = 8)) +
theme(axis.text.y = element_text(hjust = 1, vjust = 0.5, size = 8)) +
theme(legend.title=element_blank())+
labs(x = "\nField", y = "Median Salary\n", title = "Male and Female Median Postdoc Salaries 2015\n") +
theme(text = element_text(family = "Times New Roman")) +
theme(plot.title = element_text(hjust = 0.5, size = 13)) +
theme(axis.title = element_text(size = 12)) +
scale_y_continuous(expand = c(0,0)) +
scale_x_discrete(labels = function(y) lapply(strwrap(y, width = 25, simplify = FALSE), paste, collapse="\n"))
post_sal_graph
#explore data for employement salaries
male_employ_sal <- ggplot(med_salary, aes(x = employment_m)) +
geom_histogram(binwidth = 15000)
male_employ_sal #maybe normally distributed?
female_employ_sal <- ggplot(med_salary, aes(x = employment_f)) +
geom_histogram(binwidth = 15000)
female_employ_sal #maybe normally distributed?
employ_sal_test <- wilcox.test(med_salary$employment_f, med_salary$employment_m, paired = TRUE)
## Warning in wilcox.test.default(med_salary$employment_f,
## med_salary$employment_m, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(med_salary$employment_f,
## med_salary$employment_m, : cannot compute exact p-value with zeroes
employ_sal_test
##
## Wilcoxon signed rank test with continuity correction
##
## data: med_salary$employment_f and med_salary$employment_m
## V = 4, p-value = 0.002572
## alternative hypothesis: true location shift is not equal to 0
#there is a significant difference, p = .003, so test cliffs delta
employ_sal_cliffs <- cliff.delta(med_salary$employment_f, med_salary$employment_m)
employ_sal_cliffs
##
## Cliff's Delta
##
## delta estimate: -0.2133333 (small)
## 95 percent confidence interval:
## inf sup
## -0.5633695 0.2016324
employ_sal_graph <- med_sal_2 %>%
filter(variable == "employment_m" | variable == "employment_f") %>%
ggplot(aes(x = field, y = value)) +
coord_flip() +
geom_col(aes(fill = variable), position = "dodge") +
scale_fill_manual(values = c("royalblue1", "palevioletred1"), labels = c("Male", "Female")) +
theme_classic() +
theme(axis.text.x = element_text(hjust = 0.5, size = 8)) +
theme(axis.text.y = element_text(hjust = 1, vjust = 0.5, size = 8)) +
theme(legend.title=element_blank())+
labs(x = "\nField", y = "Median Salary\n", title = "Male and Female Median non Postdoc Salaries 2015\n") +
theme(text = element_text(family = "Times New Roman")) +
theme(plot.title = element_text(hjust = 0.5, size = 13)) +
theme(axis.title = element_text(size = 12)) +
scale_y_continuous(expand = c(0,0)) +
scale_x_discrete(labels = function(y) lapply(strwrap(y, width = 25, simplify = FALSE), paste, collapse="\n"))
employ_sal_graph
###############################################################
# 4) Exploring academic salaries for professors in U.S. colleges
###############################################################
# Find a good multiple linear regression model with the output being salary, and the variables (some combination of) sex, discipline, rank, years of service, years since phd
# Some data exploration
#plot of salary v years service
salary_plot <- ggplot (salary_data, aes(x=salary, y=yrs.service))+
geom_point(aes(color=sex), alpha=0.6)
salary_plot
#plot of salary v years service
salary_plot2 <- ggplot (salary_data, aes(x=salary, y=yrs.since.phd))+
geom_point(aes(color=sex), alpha=0.6)
salary_plot2
#plot of salary by faculty rank
facultyrank_plot <- ggplot (salary_data, aes(x=faculty.rank, y=salary))+
geom_point(aes(color=sex))
facultyrank_plot
# well this doesn't seem to be saying much, not seeing any glaring trends here
# summary table looking at salary by sex and faculty position... interesting? maybe.
avg_salary_sex <- salary_data %>%
group_by(sex, faculty.rank) %>%
summarize(mean= mean(salary),
count=n())
avg_salary_sex
sex faculty.rank mean count
# linear model with ALL possible variables
salary_lm1 <- lm (salary ~ sex+faculty.rank+discipline+yrs.since.phd+yrs.service, data=salary_data)
summary(salary_lm1)
Call: lm(formula = salary ~ sex + faculty.rank + discipline + yrs.since.phd + yrs.service, data = salary_data)
Residuals: Min 1Q Median 3Q Max -65248 -13211 -1775 10384 99592
Coefficients: Estimate Std. Error t value Pr(>|t|)
(Intercept) 78862.8 4990.3 15.803 < 2e-16 sexMale 4783.5 3858.7 1.240 0.21584
faculty.rankAsstProf -12907.6 4145.3 -3.114 0.00198 faculty.rankProf 32158.4 3540.6 9.083 < 2e-16 disciplineB 14417.6 2342.9 6.154 1.88e-09 yrs.since.phd 535.1 241.0 2.220 0.02698
yrs.service -489.5 211.9 -2.310 0.02143 *
— Signif. codes: 0 ‘’ 0.001 ’’ 0.01 ’’ 0.05 ‘.’ 0.1 ‘’ 1
Residual standard error: 22540 on 390 degrees of freedom Multiple R-squared: 0.4547, Adjusted R-squared: 0.4463 F-statistic: 54.2 on 6 and 390 DF, p-value: < 2.2e-16
# taking out yrs.since.phd, since it likely pretty much same as years.service; also makes no sense that the years of service would have a negative coefficient
salary_lm2 <- lm(salary ~ sex+faculty.rank+discipline+yrs.since.phd, data=salary_data)
AIC (salary_lm2) # 9097.22
[1] 9097.22
vif(salary_lm2) # all <4
GVIF Df GVIF^(1/(2*Df))
sex 1.028359 1 1.014080 faculty.rank 1.987205 2 1.187301 discipline 1.055727 1 1.027486 yrs.since.phd 2.065517 1 1.437191
summary(salary_lm2)
Call: lm(formula = salary ~ sex + faculty.rank + discipline + yrs.since.phd, data = salary_data)
Residuals: Min 1Q Median 3Q Max -67451 -13860 -1549 10716 97023
Coefficients: Estimate Std. Error t value Pr(>|t|)
(Intercept) 80988.47 4931.84 16.422 < 2e-16 sexMale 4349.37 3875.39 1.122 0.26242
faculty.rankAsstProf -13104.15 4167.31 -3.145 0.00179 faculty.rankProf 32928.40 3544.40 9.290 < 2e-16 disciplineB 13937.47 2346.53 5.940 6.32e-09 * yrs.since.phd 61.01 127.01 0.480 0.63124
— Signif. codes: 0 ‘’ 0.001 ’’ 0.01 ’’ 0.05 ‘.’ 0.1 ‘’ 1
Residual standard error: 22660 on 391 degrees of freedom Multiple R-squared: 0.4472, Adjusted R-squared: 0.4401 F-statistic: 63.27 on 5 and 391 DF, p-value: < 2.2e-16
# taking out yrs.service since that and faculty.rank are likely collinear
salary_lm3 <- lm(salary ~ faculty.rank + discipline + yrs.since.phd + sex + sex*faculty.rank, data=salary_data)
AIC(salary_lm3) # 9100.777
[1] 9100.777
vif(salary_lm3) # really high for faculty.rank and faculty.rank*sex
GVIF Df GVIF^(1/(2*Df))
faculty.rank 60.837395 2 2.792818 discipline 1.059384 1 1.029264 yrs.since.phd 2.077706 1 1.441425 sex 4.168178 1 2.041612 faculty.rank:sex 86.926719 2 3.053432
summary(salary_lm3)
Call: lm(formula = salary ~ faculty.rank + discipline + yrs.since.phd + sex + sex * faculty.rank, data = salary_data)
Residuals: Min 1Q Median 3Q Max -67531 -13938 -1215 10765 96921
Coefficients: Estimate Std. Error t value Pr(>|t|)
(Intercept) 79193.33 7654.84 10.346 < 2e-16 faculty.rankAsstProf -7848.08 10016.19 -0.784 0.433787
faculty.rankProf 33599.53 9015.90 3.727 0.000223 disciplineB 14028.24 2355.32 5.956 5.79e-09 *** yrs.since.phd 58.23 127.64 0.456 0.648510
sexMale 6464.05 7817.86 0.827 0.408839
faculty.rankAsstProf:sexMale -6308.13 10839.48 -0.582 0.560932
faculty.rankProf:sexMale -874.01 9603.83 -0.091 0.927535
— Signif. codes: 0 ‘’ 0.001 ’’ 0.01 ’’ 0.05 ‘.’ 0.1 ‘’ 1
Residual standard error: 22710 on 389 degrees of freedom Multiple R-squared: 0.4478, Adjusted R-squared: 0.4379 F-statistic: 45.07 on 7 and 389 DF, p-value: < 2.2e-16
plot(salary_lm3)
# taking out yrs.since.phd and keeping yrs.service
salary_lm4<- lm (salary~discipline+yrs.service+sex, data= salary_data)
AIC(salary_lm4) # 9257.162
[1] 9257.162
vif(salary_lm4) # all < 4
discipline yrs.service sex 1.028760 1.053649 1.025117
summary(salary_lm4)
Call: lm(formula = salary ~ discipline + yrs.service + sex, data = salary_data)
Residuals: Min 1Q Median 3Q Max -77424 -19404 -4635 15539 105391
Coefficients: Estimate Std. Error t value Pr(>|t|)
(Intercept) 84361.1 4941.4 17.072 < 2e-16 disciplineB 13033.8 2840.3 4.589 6.01e-06 yrs.service 832.2 110.2 7.550 3.07e-13 *** sexMale 8423.3 4744.5 1.775 0.0766 .
— Signif. codes: 0 ‘’ 0.001 ’’ 0.01 ’’ 0.05 ‘.’ 0.1 ‘’ 1
Residual standard error: 27790 on 393 degrees of freedom Multiple R-squared: 0.1646, Adjusted R-squared: 0.1582 F-statistic: 25.81 on 3 and 393 DF, p-value: 2.928e-15
stargazer(salary_lm1, salary_lm2, salary_lm3, salary_lm4, type = "html")
| Dependent variable: | ||||
| salary | ||||
| (1) | (2) | (3) | (4) | |
| sexMale | 4,783.493 | 4,349.366 | 6,464.051 | 8,423.335* |
| (3,858.668) | (3,875.393) | (7,817.858) | (4,744.537) | |
| faculty.rankAsstProf:sexMale | -6,308.131 | |||
| (10,839.480) | ||||
| faculty.rankProf:sexMale | -874.006 | |||
| (9,603.828) | ||||
| faculty.rankAsstProf | -12,907.590*** | -13,104.150*** | -7,848.084 | |
| (4,145.278) | (4,167.315) | (10,016.190) | ||
| faculty.rankProf | 32,158.410*** | 32,928.400*** | 33,599.530*** | |
| (3,540.647) | (3,544.403) | (9,015.904) | ||
| disciplineB | 14,417.630*** | 13,937.470*** | 14,028.240*** | 13,033.850*** |
| (2,342.875) | (2,346.534) | (2,355.315) | (2,840.349) | |
| yrs.since.phd | 535.058** | 61.011 | 58.228 | |
| (240.994) | (127.010) | (127.640) | ||
| yrs.service | -489.516** | 832.154*** | ||
| (211.938) | (110.215) | |||
| Constant | 78,862.820*** | 80,988.470*** | 79,193.330*** | 84,361.070*** |
| (4,990.326) | (4,931.844) | (7,654.839) | (4,941.371) | |
| Observations | 397 | 397 | 397 | 397 |
| R2 | 0.455 | 0.447 | 0.448 | 0.165 |
| Adjusted R2 | 0.446 | 0.440 | 0.438 | 0.158 |
| Residual Std. Error | 22,538.650 (df = 390) | 22,663.240 (df = 391) | 22,708.750 (df = 389) | 27,789.810 (df = 393) |
| F Statistic | 54.195*** (df = 6; 390) | 63.266*** (df = 5; 391) | 45.071*** (df = 7; 389) | 25.810*** (df = 3; 393) |
| Note: | p<0.1; p<0.05; p<0.01 | |||